We begin with the standard imports:
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np
In [ ]:
def make_data(N, f=0.3, rseed=1):
    """Draw N points from a mixture of two normal distributions."""
    rand = np.random.RandomState(rseed)
    x = rand.randn(N)
    x[int(f * N):] += 5  # shift the last (1 - f) fraction of points
    return x
x = make_data(1000)
Let's look at this data with a standard count-based histogram:
In [ ]:
hist = plt.hist(x, bins=30)
Or, more properly, we can create a normalized histogram, where the height of the bins reflects probability density rather than raw counts:
In [ ]:
hist = plt.hist(x, bins=30, density=True)
This normalization is chosen so that the total area under the histogram is equal to 1, as we can confirm from the histogram's return values:
In [ ]:
density, bins, _ = hist
widths = bins[1:] - bins[:-1]
(density * widths).sum()
One problem with histograms as density estimators is that the choice of bin size and location can lead to representations with qualitatively different features. To see this, let's look at a smaller dataset of 20 points:
In [ ]:
x = make_data(20)
bins = np.linspace(-5, 10, 10)
Here are two histograms of the same data, using bins that differ only by a small offset:
In [ ]:
fig, ax = plt.subplots(1, 2, figsize=(12, 4), sharex=True, sharey=True,
                       subplot_kw={'xlim':(-4, 9), 'ylim':(-0.02, 0.3)})
fig.subplots_adjust(wspace=0.05)
for i, offset in enumerate([0.0, 0.6]):
    ax[i].hist(x, bins=bins + offset, density=True)
    ax[i].plot(x, np.full_like(x, -0.01), '|k', markeredgewidth=1)
The two visualizations, built from the same data, look entirely different. To see why, we can think of a histogram as a stack of blocks, where one block is stacked within each bin on top of each point in the dataset. Let's view this directly:
In [ ]:
fig, ax = plt.subplots()
bins = np.arange(-3, 8)
ax.plot(x, np.full_like(x, -0.1), '|k', markeredgewidth=1)
# stack one unit block per point within its bin
for count, edge in zip(*np.histogram(x, bins)):
    for i in range(count):
        ax.add_patch(plt.Rectangle((edge, i), 1, 1, alpha=0.5))
ax.set_xlim(-4, 8)
ax.set_ylim(-0.2, 8)
The height of the block stack often reflects not the actual density of nearby points, but coincidences of how the bins align with the data. What if, instead, we centered each block on the point it represents and summed the total height at each location along the x-axis?
In [ ]:
x_d = np.linspace(-4, 8, 2000)
# one unit-width "block" centered on each data point
density = sum((abs(xi - x_d) < 0.5) for xi in x)
plt.fill_between(x_d, density, alpha=0.5)
plt.plot(x, np.full_like(x, -0.1), '|k', markeredgewidth=1)
plt.axis([-4, 8, -0.2, 8]);
The result reflects the characteristics of the data much more robustly than the standard histogram, though its rough edges come from the hard cutoff of the square blocks. We can smooth these out by replacing each block with a smooth function, such as a Gaussian:
In [ ]:
from scipy.stats import norm
x_d = np.linspace(-4, 8, 1000)
# place a unit-variance normal distribution at each point and sum them
density = sum(norm(xi).pdf(x_d) for xi in x)
plt.fill_between(x_d, density, alpha=0.5)
plt.plot(x, np.full_like(x, -0.1), '|k', markeredgewidth=1)
plt.axis([-4, 8, -0.2, 5]);
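These last two plots are examples of kernel density estimation in one dimension: the first uses a "tophat" kernel and the second a Gaussian kernel. Note that as written, the curve sums len(x) unit-area Gaussians, so its total area is the number of points rather than 1. As a minimal sketch (the normalization and numerical check here are additions, not in the original code), dividing by the number of points recovers a proper density:
In [ ]:
# sketch: normalize the summed Gaussians by the number of points
# so that the curve integrates to 1, like a true probability density
density_norm = sum(norm(xi).pdf(x_d) for xi in x) / len(x)
print(np.trapz(density_norm, x_d))  # approximately 1.0
plt.fill_between(x_d, density_norm, alpha=0.5)
plt.plot(x, np.full_like(x, -0.1), '|k', markeredgewidth=1);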
This algorithm is implemented in the sklearn.neighbors.KernelDensity estimator, which handles KDE in multiple dimensions with one of six kernels. Let's first show a simple example of replicating the above plot using the Scikit-Learn KernelDensity estimator:
In [ ]:
from sklearn.neighbors import KernelDensity
# instantiate and fit the KDE model
kde = KernelDensity(bandwidth=1.0, kernel='gaussian')
kde.fit(x[:, None])
# score_samples returns the log of the probability density
logprob = kde.score_samples(x_d[:, None])
plt.fill_between(x_d, np.exp(logprob), alpha=0.5)
plt.plot(x, np.full_like(x, -0.01), '|k', markeredgewidth=1)
plt.ylim(-0.02, 0.22)
The result here is normalized such that the area under the curve is equal to 1.
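We can confirm this numerically; as a quick sketch (this check is an addition to the original), integrating the estimated density over the grid with the trapezoidal rule should return a value very close to 1:
In [ ]:
# sketch: numerically integrate the KDE curve to confirm it has unit area
np.trapz(np.exp(logprob), x_d)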